configfile: "config/revision1.yaml"

# singularity: "docker://alzhang/scrna-analysis-follicular-3.5:v1.6"
singularity: "docker://alzhang/spectrum-master-rstudio:v1.4"

rule all:
    input:
        #hgsc_normalized='{workdir}/sce_hgsc_normalized.rds'.format(workdir=config['workdir']),
        #hgsc_cellassign_annotated='{workdir}/sce_hgsc_celltype_annotated.rds'.format(workdir=config['workdir']),
        #'{outdir}/cyclone/cc_result.rds'.format(outdir=config['outdir']),
        hgsc_annotated='{outdir}/sce_hgsc_annotated_final.rds'.format(outdir=config['outdir']),
        #broad_assignments='{outdir}/broad_assignments.rds'.format(outdir=config['outdir']),
        de_tables_reactome=expand('{outdir}/differential_expression/ReactomePA/site/{{celltype_group}}/{{patient_group}}.rds'.format(outdir=config['outdir']), celltype_group=config['differential_expression_settings']['ReactomePA']['celltype_groups'], patient_group=config['differential_expression_settings']['ReactomePA']['patient_groups']),
        de_tables_fgsea=expand('{outdir}/differential_expression/fgsea/site/{{celltype_group}}/{{patient_group}}.rds'.format(outdir=config['outdir']), celltype_group=config['differential_expression_settings']['fgsea']['celltype_groups'], patient_group=config['differential_expression_settings']['fgsea']['patient_groups']),
        de_tables_epithelial_clusters=expand('{outdir}/differential_expression/fgsea/epithelial_clusters/{{patient_group}}.rds'.format(outdir=config['outdir']), patient_group=config['differential_expression_settings']['fgsea']['patient_groups']),
        shih_normal_normalized='{workdir}/sce_shih_normal_normalized.rds'.format(workdir=config['workdir']),


# Create SingleCellExperiment
rule create_hgsc_sce:
    input:
        filtered_matrices=[config['hgsc_data'][x]['filtered_matrix_path'] for x in config['hgsc_data']],
    output:
        '{workdir}/sce_hgsc.rds'.format(workdir=config['workdir'])
    params:
        name='create-hgsc-sce',
        sample_names=[config['hgsc_data'][x]['dataset'] for x in config['hgsc_data']],
        timepoints=[config['hgsc_data'][x]['timepoint'] for x in config['hgsc_data']],
        sites=[config['hgsc_data'][x]['site'] for x in config['hgsc_data']],
        patients=[config['hgsc_data'][x]['patient'] for x in config['hgsc_data']],
    log:
        '{logdir}/logs/create_hgsc_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/create_hgsc_sce.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/create_sce.R '
        '--sample_names {params.sample_names} '
        '--filtered_matrices {input.filtered_matrices} '
        '--timepoints {params.timepoints} '
        '--sites {params.sites} '
        '--patients {params.patients} '
        '--outfname {output} '
        '>& {log}'


# Filter and normalize SingleCellExperiment
rule preprocess_hgsc_sce:
    input:
        hgsc_raw='{workdir}/sce_hgsc.rds'.format(workdir=config['workdir']),
    output:
        hgsc_normalized='{workdir}/sce_hgsc_normalized.rds'.format(workdir=config['workdir']),
    params:
        name='preprocess-hgsc-sce',
        mito_thres=config['filter_settings']['hgsc']['mito_thres'],
        ribo_thres=config['filter_settings']['hgsc']['ribo_thres'],
    log:
        '{logdir}/logs/preprocess_hgsc_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/preprocess_hgsc_sce.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/preprocess_sce.R '
        '--sce {input.hgsc_raw} '
        '--mito_thres {params.mito_thres} '
        '--ribo_thres {params.ribo_thres} '
        '--umap_neighbors 25 '
        '--umap_min_dist 0.2 '
        '--outfname {output} '
        '>& {log}'

# Cell cycle assignments (TODO: Run this in multi-core mode)
rule cyclone_hgsc_sce:
    input:
        hgsc_normalized='{workdir}/sce_hgsc_normalized.rds'.format(workdir=config['workdir']),
    output:
        cc_result='{outdir}/cyclone/cc_result.rds'.format(outdir=config['outdir']),
    params:
        name='cyclone-hgsc-sce',
    log:
        '{logdir}/logs/cyclone_hgsc_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/cyclone_hgsc_sce.txt'.format(logdir=config['logdir'])
    threads: 15
    shell:
        'Rscript R/cyclone_sce.R '
        '--sce {input.hgsc_normalized} '
        '--ncpus {threads} '
        '--outfname {output} '
        '>& {log}'

# Assign cells to specific subtypes, including stromal subtypes
rule cellassign_hgsc_big:
    input:
        hgsc_normalized='{workdir}/sce_hgsc_normalized.rds'.format(workdir=config['workdir']),
        marker_list_big=config['marker_lists']['big']['path'],
    output:
        specific_assignments='{outdir}/specific_assignments.rds'.format(outdir=config['outdir']),
    params:
        name='cellassign-hgsc-big',
        include_other=config['marker_lists']['big']['include_other'],
        num_runs=config['cellassign_settings']['num_runs'],
        B=config['cellassign_settings']['B'],
        extra_opts=config['cellassign_settings']['extra_opts'],
    threads: 15
    log:
        '{logdir}/logs/cellassign_hgsc_big.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/cellassign_hgsc_big.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/cellassign_sce.R '
        '--sce {input.hgsc_normalized} '
        '--marker_list {input.marker_list_big} '
        '--include_other {params.include_other} '
        '--num_runs {params.num_runs} '
        '--rbf_pieces {params.B} '
        '{params.extra_opts} '
        '--outfname {output.specific_assignments} '
        '>& {log}'

# Assign cells to broad subtypes (i.e no T cell split)
rule cellassign_hgsc_small:
    input:
        hgsc_normalized='{workdir}/sce_hgsc_normalized.rds'.format(workdir=config['workdir']),
        marker_list_small=config['marker_lists']['small']['path'],
    output:
        broad_assignments='{outdir}/broad_assignments.rds'.format(outdir=config['outdir']),
    params:
        name='cellassign-hgsc-small',
        include_other=config['marker_lists']['small']['include_other'],
        num_runs=config['cellassign_settings']['num_runs'],
        B=config['cellassign_settings']['B'],
        extra_opts=config['cellassign_settings']['extra_opts'],
    threads: 15
    log:
        '{logdir}/logs/cellassign_hgsc_small.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/cellassign_hgsc_small.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/cellassign_sce.R '
        '--sce {input.hgsc_normalized} '
        '--marker_list {input.marker_list_small} '
        '--include_other {params.include_other} '
        '--num_runs {params.num_runs} '
        '--rbf_pieces {params.B} '
        '{params.extra_opts} '
        '--outfname {output.broad_assignments} '
        '>& {log}'

# Annotate HGSC SCE with assignments
rule annotate_cellassign_sce:
    input:
        hgsc_normalized='{workdir}/sce_hgsc_normalized.rds'.format(workdir=config['workdir']),
        specific_assignments='{outdir}/specific_assignments.rds'.format(outdir=config['outdir']),
        broad_assignments='{outdir}/broad_assignments.rds'.format(outdir=config['outdir']),
    output:
        hgsc_cellassign_annotated='{workdir}/sce_hgsc_celltype_annotated.rds'.format(workdir=config['workdir']),
    params:
        name='annotate-cellassign',
    log:
        '{logdir}/logs/annotate_cellassign.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/annotate_cellassign.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/annotate_cellassign.R '
        '--sce {input.hgsc_normalized} '
        '--broad {input.broad_assignments} '
        '--specific {input.specific_assignments} '
        '--outfname {output.hgsc_cellassign_annotated} '
        '>& {log}'

# Unsupervised clustering of epithelial cells
rule hgsc_unsupervised_epithelial:
    input:
        hgsc_cellassign_annotated='{workdir}/sce_hgsc_celltype_annotated.rds'.format(workdir=config['workdir']),
    output:
        unsupervised_epithelial_assignments='{outdir}/unsupervised_epithelial_assignments.tsv'.format(outdir=config['outdir']),
    params:
        name='hgsc-unsupervised-epithelial',
        random_seed=config['cluster_random_seed'],
        cluster_methods=config['cluster_settings']['hgsc_epithelial']['methods'],
        cluster_use_method=config['cluster_settings']['hgsc_epithelial']['use_method'],
        celltypes=config['cluster_settings']['hgsc_epithelial']['celltypes'],
    log:
        '{logdir}/logs/hgsc_unsupervised_epithelial.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/hgsc_unsupervised_epithelial.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/unsupervised_cluster_sce.R '
        '--sce {input.hgsc_cellassign_annotated} '
        '--celltypes {params.celltypes} '
        '--clustering_methods {params.cluster_methods} '
        '--clustering_method_use {params.cluster_use_method} '
        '--random_seed {params.random_seed} '
        '--outfname {output.unsupervised_epithelial_assignments} '
        '>& {log}'

# Unsupervised clustering of all cells
rule hgsc_unsupervised_all:
    input:
        hgsc_cellassign_annotated='{workdir}/sce_hgsc_celltype_annotated.rds'.format(workdir=config['workdir']),
    output:
        unsupervised_all_assignments='{outdir}/unsupervised_all_assignments.tsv'.format(outdir=config['outdir']),
    params:
        name='hgsc-unsupervised-all',
        random_seed=config['cluster_random_seed'],
        cluster_methods=config['cluster_settings']['hgsc_all']['methods'],
        cluster_use_method=config['cluster_settings']['hgsc_all']['use_method'],
    log:
        '{logdir}/logs/hgsc_unsupervised_all.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/hgsc_unsupervised_all.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/unsupervised_cluster_sce.R '
        '--sce {input.hgsc_cellassign_annotated} '
        '--clustering_methods {params.cluster_methods} '
        '--clustering_method_use {params.cluster_use_method} '
        '--random_seed {params.random_seed} '
        '--outfname {output.unsupervised_all_assignments} '
        '>& {log}'

# Unsupervised clustering of all cells, using a subset of genes (marker genes)
rule hgsc_unsupervised_all_subset:
    input:
        hgsc_cellassign_annotated='{workdir}/sce_hgsc_celltype_annotated.rds'.format(workdir=config['workdir']),
        marker_list_small=config['marker_lists']['small']['path'],
    output:
        unsupervised_all_subset_assignments='{outdir}/unsupervised_all_subset_assignments.tsv'.format(outdir=config['outdir']),
    params:
        name='hgsc-unsupervised-all-subset',
        random_seed=config['cluster_random_seed'],
        cluster_methods=config['cluster_settings']['hgsc_all_subset']['methods'],
        cluster_use_method=config['cluster_settings']['hgsc_all_subset']['use_method'],
    log:
        '{logdir}/logs/hgsc_unsupervised_all_subset.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/hgsc_unsupervised_all_subset.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/unsupervised_cluster_sce.R '
        '--sce {input.hgsc_cellassign_annotated} '
        '--clustering_methods {params.cluster_methods} '
        '--clustering_method_use {params.cluster_use_method} '
        '--marker_list {input.marker_list_small} '
        '--random_seed {params.random_seed} '
        '--outfname {output.unsupervised_all_subset_assignments} '
        '>& {log}'

# Annotate HGSC SCE with assignments
rule annotate_hgsc_final:
    input:
        hgsc_cellassign_annotated='{workdir}/sce_hgsc_celltype_annotated.rds'.format(workdir=config['workdir']),
        cc_result='{outdir}/cyclone/cc_result.rds'.format(outdir=config['outdir']),
        unsupervised_epithelial_assignments='{outdir}/unsupervised_epithelial_assignments.tsv'.format(outdir=config['outdir']),
        unsupervised_all_assignments='{outdir}/unsupervised_all_assignments.tsv'.format(outdir=config['outdir']),
        unsupervised_all_subset_assignments='{outdir}/unsupervised_all_subset_assignments.tsv'.format(outdir=config['outdir']),
    output:
        hgsc_annotated='{outdir}/sce_hgsc_annotated_final.rds'.format(outdir=config['outdir']),
    params:
        name='annotate-hgsc-final',
    log:
        '{logdir}/logs/annotate_hgsc_final.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/annotate_hgsc_final.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/annotate_hgsc_final.R '
        '--sce {input.hgsc_cellassign_annotated} '
        '--cyclone {input.cc_result} '
        '--unsupervised_epithelial {input.unsupervised_epithelial_assignments} '
        '--unsupervised_all {input.unsupervised_all_assignments} '
        '--unsupervised_all_subset {input.unsupervised_all_subset_assignments} '
        '--outfname {output.hgsc_annotated} '
        '>& {log}'

rule differential_expression_site_by_celltype:
    input:
        hgsc_annotated='{outdir}/sce_hgsc_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        de_tables='{outdir}/differential_expression/ReactomePA/site/{{celltype_group}}/{{patient_group}}.rds'.format(outdir=config['outdir']),
    params:
        celltype_group=lambda wildcards: config['differential_expression_settings']['ReactomePA']['celltype_groups'][wildcards.celltype_group],
        patient_group=lambda wildcards: config['differential_expression_settings']['ReactomePA']['patient_groups'][wildcards.patient_group],
        de_method=config['differential_expression_settings']['ReactomePA']['gene_method'],
        num_genes=config['differential_expression_settings']['ReactomePA']['num_genes'],
        min_gene_counts=config['differential_expression_settings']['ReactomePA']['min_gene_counts'],
        name='differential-expression-reactome-site-{celltype_group}-{patient_group}',
    log:
        '{logdir}/logs/differential_expression_site/ReactomePA/{{celltype_group}}/{{patient_group}}.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/differential_expression_site/ReactomePA/{{celltype_group}}/{{patient_group}}.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/de_space.R '
        '--sce {input.hgsc_annotated} '
        '--celltypes {params.celltype_group} '
        '--patients {params.patient_group} '
        '--method_gene {params.de_method} '
        '--min_gene_counts {params.min_gene_counts} '
        '--method_pathway ReactomePA '
        '--ngene {params.num_genes} '
        '--outfname {output.de_tables} '
        '>& {log}'

rule differential_expression_fgsea_site:
    input:
        hgsc_annotated='{outdir}/sce_hgsc_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        de_tables='{outdir}/differential_expression/fgsea/site/{{celltype_group}}/{{patient_group}}.rds'.format(outdir=config['outdir']),
    params:
        celltype_group=lambda wildcards: config['differential_expression_settings']['fgsea']['celltype_groups'][wildcards.celltype_group],
        patient_group=lambda wildcards: config['differential_expression_settings']['fgsea']['patient_groups'][wildcards.patient_group],
        de_method=config['differential_expression_settings']['fgsea']['gene_method'],
        min_gene_counts=config['differential_expression_settings']['fgsea']['min_gene_counts'],
        gene_set_file=config['gene_sets']['hallmark'],
        name='differential-expression-site-fgsea-{celltype_group}-{patient_group}',
    log:
        '{logdir}/logs/differential_expression_site/fgsea/{{celltype_group}}/{{patient_group}}.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/differential_expression_site/fgsea/{{celltype_group}}/{{patient_group}}.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/de_space.R '
        '--sce {input.hgsc_annotated} '
        '--celltypes {params.celltype_group} '
        '--patients {params.patient_group} '
        '--method_gene {params.de_method} '
        '--min_gene_counts {params.min_gene_counts} '
        '--method_pathway fgsea '
        '--gene_set_file {params.gene_set_file} '
        '--outfname {output.de_tables} '
        '>& {log}'

rule differential_expression_epithelial_clusters:
    input:
        hgsc_annotated='{outdir}/sce_hgsc_annotated_final.rds'.format(outdir=config['outdir']),
    output:
        de_tables='{outdir}/differential_expression/fgsea/epithelial_clusters/{{patient_group}}.rds'.format(outdir=config['outdir']),
    params:
        patient_group=lambda wildcards: config['differential_expression_settings']['ReactomePA']['patient_groups'][wildcards.patient_group],
        cluster_col='epithelial_cluster',
        min_gene_counts=config['differential_expression_settings']['fgsea']['min_gene_counts'],
        gene_set_file=config['gene_sets']['hallmark'],
        name='differential-expression-fgsea-epithelial-clusters-{patient_group}',
    log:
        '{logdir}/logs/differential_expression_epithelial_clusters/fgsea/{{patient_group}}.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/differential_expression_epithelial_clusters/fgsea/{{patient_group}}.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/de_epithelial_clusters.R '
        '--sce {input.hgsc_annotated} '
        '--patients {params.patient_group} '
        '--cluster_col {params.cluster_col} '
        '--min_gene_counts {params.min_gene_counts} '
        '--gene_set_file {params.gene_set_file} '
        '--outfname {output.de_tables} '
        '>& {log}'

## SHIH

# Create SingleCellExperiment
rule create_shih_normal_sce:
    input:
        umicount_files=[config['shih_normal_data'][x]['umicount_path'] for x in config['shih_normal_data']],
    output:
        '{workdir}/sce_shih_normal.rds'.format(workdir=config['workdir'])
    params:
        name='create-shih-normal-sce',
        sample_names=[config['shih_normal_data'][x]['dataset'] for x in config['shih_normal_data']],
        timepoints=[config['shih_normal_data'][x]['timepoint'] for x in config['shih_normal_data']],
        sites=[config['shih_normal_data'][x]['site'] for x in config['shih_normal_data']],
        patients=[config['shih_normal_data'][x]['patient'] for x in config['shih_normal_data']],
    log:
        '{logdir}/logs/create_shih_normal_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/create_shih_normal_sce.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/create_umicount_sce.R '
        '--sample_names {params.sample_names} '
        '--umicount_files {input.umicount_files} '
        '--timepoints {params.timepoints} '
        '--sites {params.sites} '
        '--patients {params.patients} '
        '--outfname {output} '
        '>& {log}'


# Filter and normalize SingleCellExperiment
rule preprocess_shih_normal_sce:
    input:
        shih_normal_raw='{workdir}/sce_shih_normal.rds'.format(workdir=config['workdir']),
    output:
        shih_normal_normalized='{workdir}/sce_shih_normal_normalized.rds'.format(workdir=config['workdir']),
    params:
        name='preprocess-shih-normal-sce',
        mito_thres=config['filter_settings']['shih']['mito_thres'],
        ribo_thres=config['filter_settings']['shih']['ribo_thres'],
    log:
        '{logdir}/logs/preprocess_shih_normal_sce.log'.format(logdir=config['logdir'])
    benchmark:
        '{logdir}/benchmarks/preprocess_shih_normal_sce.txt'.format(logdir=config['logdir'])
    shell:
        'Rscript R/preprocess_sce.R '
        '--sce {input.shih_normal_raw} '
        '--mito_thres {params.mito_thres} '
        '--ribo_thres {params.ribo_thres} '
        '--outfname {output} '
        '>& {log}'